Package Import

In [1]:
pip install -r requirements.txt
In [2]:
import sqlite3
import pycountry
import datetime
import numpy as np
import pandas as pd
import plotly.express as px

Data Ingestion, Cleaning and Feature Engineering

In [3]:
df = pd.read_csv("Data.csv")

df["Cases per 100,000 people"] = (df["Cases"] / (df["Population"] / 100000)).apply(lambda x: round(x,2))
df["Date"] = pd.to_datetime(df["Date"])
df["Deaths per 1000 Cases"] = (df["Deaths"] / df["Cases"] * 1000).apply(lambda x: round(x,2))

df.rename({"Entity": "Country"}, axis=1, inplace=True)

country_rename_dict = {"Bolivia": "Bolivia, Plurinational State of", 
                       "Cape Verde": "Cabo Verde", 
                       "Iran": "Iran, Islamic Republic of", 
                       "Russia": "Russian Federation", 
                       "South Korea": "Korea, Republic of",
                       "Vietnam": "Viet Nam"}

df["Country"].replace(country_rename_dict, inplace=True)

countries = {}

for country in pycountry.countries:
    countries[country.name] = country.alpha_3
    
df["Country Code"] = df["Country"].apply(lambda x: countries.get(x))

df
Out[3]:
Country Continent Latitude Longitude Average temperature per year Hospital beds per 1000 people Medical doctors per 1000 people GDP/Capita Population Median age Population aged 65 and over (%) Date Daily tests Cases Deaths Cases per 100,000 people Deaths per 1000 Cases Country Code
0 Albania Europe 41.15 20.17 14 2.89 1.29 5353.2 2873457 38 14 2020-02-25 8.0 NaN NaN NaN NaN ALB
1 Albania Europe 41.15 20.17 14 2.89 1.29 5353.2 2873457 38 14 2020-02-26 5.0 NaN NaN NaN NaN ALB
2 Albania Europe 41.15 20.17 14 2.89 1.29 5353.2 2873457 38 14 2020-02-27 4.0 NaN NaN NaN NaN ALB
3 Albania Europe 41.15 20.17 14 2.89 1.29 5353.2 2873457 38 14 2020-02-28 1.0 NaN NaN NaN NaN ALB
4 Albania Europe 41.15 20.17 14 2.89 1.29 5353.2 2873457 38 14 2020-02-29 8.0 NaN NaN NaN NaN ALB
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
38467 Zimbabwe Africa -19.02 29.15 20 1.70 0.08 1464.0 16529904 19 3 2021-02-24 1804.0 35960.0 1456.0 217.55 40.49 ZWE
38468 Zimbabwe Africa -19.02 29.15 20 1.70 0.08 1464.0 16529904 19 3 2021-02-25 2965.0 35994.0 1458.0 217.75 40.51 ZWE
38469 Zimbabwe Africa -19.02 29.15 20 1.70 0.08 1464.0 16529904 19 3 2021-02-26 NaN 36044.0 1463.0 218.05 40.59 ZWE
38470 Zimbabwe Africa -19.02 29.15 20 1.70 0.08 1464.0 16529904 19 3 2021-02-27 NaN 36058.0 1463.0 218.14 40.57 ZWE
38471 Zimbabwe Africa -19.02 29.15 20 1.70 0.08 1464.0 16529904 19 3 2021-02-28 NaN 36089.0 1463.0 218.33 40.54 ZWE

38472 rows × 18 columns

In [4]:
df.isna().sum()
Out[4]:
Country                               0
Continent                             0
Latitude                              0
Longitude                             0
Average temperature per year          0
Hospital beds per 1000 people         0
Medical doctors per 1000 people       0
GDP/Capita                            0
Population                            0
Median age                            0
Population aged 65 and over (%)       0
Date                                  0
Daily tests                        7895
Cases                               254
Deaths                             3610
Cases per 100,000 people            254
Deaths per 1000 Cases              3610
Country Code                          0
dtype: int64
In [5]:
df.fillna(0, inplace=True)

Export to SQLite DB as tables

In [6]:
country_dim_table_df = df[["Country",
                           "Country Code",
                           "Continent",
                           "Latitude",
                           "Longitude",
                           "Average temperature per year",
                           "Hospital beds per 1000 people",
                           "Medical doctors per 1000 people",
                           "GDP/Capita",
                           "Population",
                           "Median age",
                           "Population aged 65 and over (%)"]].drop_duplicates()
                  
country_dim_table_df
Out[6]:
Country Country Code Continent Latitude Longitude Average temperature per year Hospital beds per 1000 people Medical doctors per 1000 people GDP/Capita Population Median age Population aged 65 and over (%)
0 Albania ALB Europe 41.15 20.17 14 2.89 1.29 5353.2 2873457 38 14
370 Algeria DZA Africa 28.03 1.66 25 1.90 1.83 3974.0 41318142 29 6
740 Argentina ARG South America -38.42 -63.62 14 5.00 3.91 9912.3 44271041 31 11
1144 Armenia ARM Europe 40.07 45.04 11 4.20 2.80 4622.7 2930450 35 11
1509 Australia AUS Oceania -25.27 133.78 22 3.84 3.50 55060.3 24598933 37 16
... ... ... ... ... ... ... ... ... ... ... ... ...
36618 United States USA North America 37.09 -95.71 11 2.77 2.57 65297.5 325719178 38 16
37022 Uruguay URY South America -32.52 -55.77 16 2.80 3.74 16190.1 3456750 35 15
37375 Viet Nam VNM Asia 14.06 108.28 25 2.60 0.82 2715.3 95540800 32 7
37778 Zambia ZMB Africa -13.13 27.85 21 2.00 0.09 1305.1 17094130 17 2
38126 Zimbabwe ZWE Africa -19.02 29.15 20 1.70 0.08 1464.0 16529904 19 3

104 rows × 12 columns

In [7]:
case_fact_table_df = df[["Country",
                         "Date",
                         "Daily tests",
                         "Cases",
                         "Deaths",
                         "Cases per 100,000 people",
                         "Deaths per 1000 Cases"]]

case_fact_table_df
Out[7]:
Country Date Daily tests Cases Deaths Cases per 100,000 people Deaths per 1000 Cases
0 Albania 2020-02-25 8.0 0.0 0.0 0.00 0.00
1 Albania 2020-02-26 5.0 0.0 0.0 0.00 0.00
2 Albania 2020-02-27 4.0 0.0 0.0 0.00 0.00
3 Albania 2020-02-28 1.0 0.0 0.0 0.00 0.00
4 Albania 2020-02-29 8.0 0.0 0.0 0.00 0.00
... ... ... ... ... ... ... ...
38467 Zimbabwe 2021-02-24 1804.0 35960.0 1456.0 217.55 40.49
38468 Zimbabwe 2021-02-25 2965.0 35994.0 1458.0 217.75 40.51
38469 Zimbabwe 2021-02-26 0.0 36044.0 1463.0 218.05 40.59
38470 Zimbabwe 2021-02-27 0.0 36058.0 1463.0 218.14 40.57
38471 Zimbabwe 2021-02-28 0.0 36089.0 1463.0 218.33 40.54

38472 rows × 7 columns

In [8]:
conn = sqlite3.connect("sqlite3_db")

c = conn.cursor()
c.execute("CREATE TABLE IF NOT EXISTS country_dim_table ('')")
c.execute("CREATE TABLE IF NOT EXISTS case_fact_table ('')")

conn.commit()

country_dim_table_df.to_sql("country_dim_table", conn, if_exists="replace", index= False)
case_fact_table_df.to_sql("case_fact_table", conn, if_exists="replace", index= False)

Visuals

Cases per 100,000 people by Country (Map)

In [9]:
df1_cols = ["Date", "Country", "Country Code", "Cases per 100,000 people"]

c.execute("""
          SELECT f.[Date],
                 f.[Country], 
                 d.[Country Code], 
                 f.[Cases per 100,000 people]
          FROM case_fact_table f, country_dim_table d
          WHERE f.Country = d.Country
          """)

df1 = pd.DataFrame(c.fetchall(), columns=df1_cols)    

df_cty_cse = df1.groupby("Country").last().reset_index()
df_cty_cse
Out[9]:
Country Date Country Code Cases per 100,000 people
0 Albania 2021-02-28 00:00:00 ALB 3729.55
1 Algeria 2021-02-28 00:00:00 DZA 273.71
2 Argentina 2021-02-28 00:00:00 ARG 4760.14
3 Armenia 2021-02-28 00:00:00 ARM 5871.38
4 Australia 2021-02-28 00:00:00 AUS 117.80
... ... ... ... ...
99 United States 2021-02-28 00:00:00 USA 8782.31
100 Uruguay 2021-02-28 00:00:00 URY 1677.70
101 Viet Nam 2021-02-28 00:00:00 VNM 2.56
102 Zambia 2021-02-28 00:00:00 ZMB 459.42
103 Zimbabwe 2021-02-28 00:00:00 ZWE 218.33

104 rows × 4 columns

In [10]:
fig = px.choropleth(df_cty_cse,
                    title=f"Number of COVID-19 cases per 100,000 people by Country as of {pd.to_datetime(max(df_cty_cse['Date'])).strftime('%d-%b-%Y')}",
                    locations="Country Code",                    
                    hover_name="Country",
                    color="Cases per 100,000 people",
                    color_continuous_scale="Reds")

fig.show()

Deaths per 1000 cases (Bar Chart)

In [11]:
df2_cols = ["Date",
            "Deaths per 1000 Cases", 
            "Medical doctors per 1000 people", 
            "Country", 
            "Continent", 
            "GDP/Capita"]

c.execute("""
          SELECT f.[Date],
                 f.[Deaths per 1000 Cases], 
                 d.[Medical doctors per 1000 people], 
                 d.[Country], 
                 d.[Continent], 
                 d.[GDP/Capita]                 
          FROM case_fact_table f, country_dim_table d
          WHERE f.Country = d.Country
          """)

df2 = pd.DataFrame(c.fetchall(), columns=df2_cols)

df_cty_dth_cse = df2.groupby("Country")\
                    .last()\
                    .sort_values(by="Deaths per 1000 Cases", ascending=True)\
                    .reset_index()

df_cty_dth_cse
Out[11]:
Country Date Deaths per 1000 Cases Medical doctors per 1000 people Continent GDP/Capita
0 Mongolia 2021-02-28 00:00:00 0.68 3.26 Asia 4339.8
1 Bhutan 2021-02-28 00:00:00 1.15 0.38 Asia 3316.2
2 Qatar 2021-02-28 00:00:00 1.58 1.96 Asia 62088.1
3 United Arab Emirates 2021-02-28 00:00:00 3.12 1.56 Asia 43103.3
4 Thailand 2021-02-28 00:00:00 3.19 0.47 Asia 7806.7
... ... ... ... ... ... ...
99 Zimbabwe 2021-02-28 00:00:00 40.54 0.08 Africa 1464.0
100 Bulgaria 2021-02-28 00:00:00 41.25 4.00 Europe 9828.1
101 Bolivia, Plurinational State of 2021-02-28 00:00:00 46.78 0.47 South America 3552.1
102 Ecuador 2021-02-28 00:00:00 55.25 1.66 South America 6183.8
103 Mexico 2021-02-28 00:00:00 88.99 2.23 North America 9946.0

104 rows × 6 columns

In [12]:
fig = px.bar(df_cty_dth_cse, 
             x="Deaths per 1000 Cases", 
             y="Country", 
             color="GDP/Capita", 
             orientation="h",
             hover_data=["Deaths per 1000 Cases", "Medical doctors per 1000 people"],
             height=2000,
             title=f"Deaths per 1000 COVID-19 Cases by Country as of {pd.to_datetime(max(df_cty_dth_cse['Date'])).strftime('%d-%b-%Y')} (Bars coloured by GDP per Capita)",
             color_continuous_scale="Reds")

fig.show()

Case Numbers over time (Line)

In [13]:
df3_cols = ["Date", "Cases", "Country", "Continent"]

c.execute("""
          SELECT f.[Date],
                 f.[Cases], 
                 d.[Country], 
                 d.[Continent]             
          FROM case_fact_table f, country_dim_table d
          WHERE f.Country = d.Country
          """)

df_cty_cse_time = pd.DataFrame(c.fetchall(), columns=df3_cols)
df_cty_cse_time
Out[13]:
Date Cases Country Continent
0 2020-02-25 00:00:00 0.0 Albania Europe
1 2020-02-26 00:00:00 0.0 Albania Europe
2 2020-02-27 00:00:00 0.0 Albania Europe
3 2020-02-28 00:00:00 0.0 Albania Europe
4 2020-02-29 00:00:00 0.0 Albania Europe
... ... ... ... ...
38467 2021-02-24 00:00:00 35960.0 Zimbabwe Africa
38468 2021-02-25 00:00:00 35994.0 Zimbabwe Africa
38469 2021-02-26 00:00:00 36044.0 Zimbabwe Africa
38470 2021-02-27 00:00:00 36058.0 Zimbabwe Africa
38471 2021-02-28 00:00:00 36089.0 Zimbabwe Africa

38472 rows × 4 columns

In [14]:
fig = px.line(df_cty_cse_time, 
              x="Date", 
              y="Cases",
              title="COVID-19 Cases over time by Country (Select continent from dropdown)",
              color="Country", 
              custom_data=["Continent"])

fig.update_layout(updatemenus = [{"buttons": [{"label":"All continents", 
                                               "method": "restyle", 
                                               "args": [{"visible": True}]}] + 
                                             [{"label": t,
                                               "method": "restyle",
                                               "args": [{"visible": [np.unique(t2.customdata)[0] == t for t2 in fig.data]}],
                                              }
                                              for t in df["Continent"].unique()
                                             ]}])

fig.show()